from sklearn import metrics
from scipy.stats import multivariate_normal
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from scipy.io import loadmat
from scipy import optimize
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import tensorflow as tf
from sklearn import svm
from sklearn.model_selection import train_test_split
pd.set_option('mode.chained_assignment', None)
pio.templates.default = "plotly_dark"
# --- Anomaly detection (ex8data1) ---
# Load the example data; 'X' holds the 2-D feature matrix and 'yval' the
# ground-truth labels (presumably 1 = anomaly, 0 = normal — per the ex8
# dataset convention; confirm against the assignment).
data = loadmat('ex8/ex8data1.mat')
X = data['X']
y = data['yval']

# Wrap in a DataFrame for convenient plotting and statistics.
df = pd.DataFrame(X)
df.columns = ['x1', 'x2']
df['y'] = y

# Scatter plot of the raw data, coloured by the label.
fig = px.scatter(df, x='x1', y='x2', color='y')
fig.update_layout(
    width=800,
    height=500
)
fig.update_yaxes(
    scaleanchor="x",  # lock a 1:1 aspect ratio so distances are not distorted
    scaleratio=1,
)
fig.show()
# Fit a single multivariate Gaussian to the two features; its mean vector
# and full covariance matrix define the density used for anomaly scoring.
mu = df[['x1', 'x2']].mean()
cov = df[['x1', 'x2']].cov()
gaussian = multivariate_normal(mu.T, cov)

# Build a 100x100 evaluation grid spanning the observed data range so the
# fitted density can be drawn as contour lines.
# NOTE(review): Series[0]/[1] positional access on a label-indexed Series is
# deprecated in newer pandas — prefer .iloc[0]/.iloc[1].
feature_x = np.linspace(df[['x1', 'x2']].min()[0],
                        df[['x1', 'x2']].max()[0], 100)
feature_y = np.linspace(df[['x1', 'x2']].min()[1],
                        df[['x1', 'x2']].max()[1], 100)
x_mesh, y_mesh = np.meshgrid(feature_x, feature_y)
pos = np.dstack((x_mesh, y_mesh))  # shape (100, 100, 2): (x, y) pairs
z = gaussian.pdf(pos)              # density evaluated over the grid
# Overlay the fitted Gaussian's density contours on the scatter plot.
fig2 = go.Figure(fig)
fig2.add_trace(
    go.Contour(
        x=feature_x,
        y=feature_y,
        z=z,
        line_width=1.5,
        contours_coloring='lines',  # contour lines only, no filled regions
    )
)
fig2.show()
def evaluate_f_score(df, dist, epsilon, features=('x1', 'x2'), target='y'):
    """Return the F1 score of flagging low-density points as anomalies.

    A point is predicted anomalous when the fitted distribution's pdf at
    that point falls below the threshold ``epsilon``.

    Parameters
    ----------
    df : pandas.DataFrame holding the feature columns and the label column.
    dist : frozen scipy distribution exposing ``.pdf`` (e.g. a
        ``multivariate_normal`` instance).
    epsilon : float density threshold.
    features : feature column names; a tuple default avoids the shared
        mutable-default-argument pitfall (a list still works if passed in).
    target : name of the ground-truth label column (1 = anomaly).
    """
    pred_y = dist.pdf(df[list(features)]) < epsilon
    # y_true first, y_pred second — sklearn's f1_score signature. (F1 is
    # symmetric under this swap, so the numeric result is unchanged; the
    # original call merely had the arguments in the misleading order.)
    return metrics.f1_score(df[target], pred_y)
# Grid-search the anomaly threshold: score every epsilon between 0.01 and 0.
# NOTE(review): linspace runs from 0.01 *down* to 0 inclusive, so epsilon == 0
# is evaluated and produces an all-negative prediction (F1 of 0 with a
# zero-division warning from sklearn).
epsilon_scores = ((e, evaluate_f_score(df, gaussian, e))
                  for e in np.linspace(0.01, 0, 1000))
score_df = pd.DataFrame(data=epsilon_scores, columns=['epsilon', 'f_score'])

# Several epsilons can tie for the best F1 — average them into one threshold.
best_score = score_df['f_score'].max()
best_epsilon_mean = score_df[score_df['f_score']
                             == best_score]['epsilon'].mean()

# Mean epsilon per distinct F1 value; iloc[1:] drops the lowest-F1 row.
all_epsilons = score_df.groupby(['f_score']).mean().reset_index().iloc[1:]
# Draw one contour line per distinct F1 level, plus a highlighted contour at
# the best averaged threshold. Densities are scaled by 1e6 so the contour
# labels stay readable.
fig3 = go.Figure(fig)
for num, r in all_epsilons.iterrows():
    fig3.add_trace(
        go.Contour(
            x=feature_x,
            y=feature_y,
            z=z*1000000,  # scale density into the 1e-6 label range
            line_width=1.5,
            contours_coloring='lines',
            autocontour=False,
            showlegend=False,
            showscale=False,
            colorscale='greens',
            name=f"{r['epsilon']} * e-6",
            # start == end draws a single contour at exactly this epsilon
            contours={'start': r['epsilon']*1000000,
                      'end': r['epsilon']*1000000,
                      'showlabels': True,
                      }
        )
    )
# Highlight the averaged best epsilon with a distinct colorscale.
fig3.add_trace(
    go.Contour(
        x=feature_x,
        y=feature_y,
        z=z*1000000,
        line_width=1.5,
        contours_coloring='lines',
        colorscale='hsv',
        autocontour=False,
        showlegend=False,
        showscale=False,
        name=f"Best epsilon: {round(best_epsilon_mean*1000000)}",
        contours={'start': best_epsilon_mean*1000000,
                  'end': best_epsilon_mean*1000000,
                  'showlabels': True,
                  }
    )
)
fig3.show()
# --- Collaborative filtering (ex8_movies) ---
# Y is a (movies x users) ratings matrix and R a same-shaped binary mask
# where R[i, j] == 1 iff user j rated movie i (per the ex8 dataset
# convention — confirm against the assignment data).
data = loadmat('ex8/ex8_movies.mat')
y_original = data['Y']
r_original = data['R']
movies_num = y_original.shape[0]
users_num = y_original.shape[1]

# Random initialisation of the latent factor matrices:
# x holds per-movie features, theta per-user preference weights.
expected_features = 100
x = np.random.rand(movies_num, expected_features)
theta = np.random.rand(users_num, expected_features)
def x_grad_fun(input_vector, x_shape, theta_shape, y, r):
    """Gradient of the collaborative-filtering cost w.r.t. the movie
    features, returned as a flat vector.

    ``input_vector`` packs x (the first prod(x_shape) entries) followed
    by theta; ``r`` masks the error so only observed ratings contribute.
    """
    split = np.prod(x_shape)
    x = input_vector[:split].reshape(x_shape)
    theta = input_vector[split:].reshape(theta_shape)
    masked_error = (x @ theta.T - y) * r
    return (masked_error @ theta).flatten()
def theta_grad_fun(input_vector, x_shape, theta_shape, y, r):
    """Gradient of the collaborative-filtering cost w.r.t. the user
    parameters, returned as a flat vector.

    Parameter packing matches ``x_grad_fun``: x first, then theta.
    """
    split = np.prod(x_shape)
    x = input_vector[:split].reshape(x_shape)
    theta = input_vector[split:].reshape(theta_shape)
    masked_error = (x @ theta.T - y) * r
    return (masked_error.T @ x).flatten()
def flatten_grad_func(input_vector, x_shape, theta_shape, y, r):
    """Full gradient of the collaborative-filtering cost as one flat
    vector: the x-gradient followed by the theta-gradient, matching the
    packing of ``input_vector``.

    The masked residual is computed once and reused for both gradient
    pieces; the original delegated to x_grad_fun/theta_grad_fun, which
    each re-unpacked the parameters and recomputed the residual.
    """
    split = np.prod(x_shape)
    x = input_vector[:split].reshape(x_shape)
    theta = input_vector[split:].reshape(theta_shape)
    residual = (x @ theta.T - y) * r  # zero where a rating is unobserved
    x_grad = (residual @ theta).flatten()
    theta_grad = (residual.T @ x).flatten()
    return np.concatenate((x_grad, theta_grad))
def cost_function_colab_filter(input_vector, x_shape, theta_shape, y, r):
    """Unregularised collaborative-filtering cost and gradient.

    Parameters
    ----------
    input_vector : flat array packing x (first prod(x_shape) entries)
        then theta.
    x_shape, theta_shape : shapes of the movie-feature and user-parameter
        matrices.
    y : observed ratings matrix (movies x users).
    r : binary mask; only entries with r == 1 contribute.

    Returns
    -------
    (cost, grad) where cost is the halved sum of squared errors over the
    observed ratings, and grad is the flat gradient (x part then theta
    part), matching the layout of ``input_vector``.

    The masked residual is computed once and shared between the cost and
    both gradient pieces; the original recomputed it three times per call
    (once here and once inside each gradient helper).
    """
    split = np.prod(x_shape)
    x = input_vector[:split].reshape(x_shape)
    theta = input_vector[split:].reshape(theta_shape)
    residual = (x @ theta.T - y) * r
    cost = np.sum(np.square(residual)) / 2
    grad = np.concatenate(((residual @ theta).flatten(),
                           (residual.T @ x).flatten()))
    return cost, grad
# Unreduced data requires ~200gb of memory
# Work on a small sub-matrix so the dense gradient descent below stays
# tractable.
num_users = 100
num_movies = 50
num_features = 25
y = y_original[:num_movies, 0:num_users]
r = r_original[:num_movies, 0:num_users]
# Flat random starting point packing x then theta (the layout expected by
# the cost/gradient functions above).
init_guess = np.random.rand(num_movies*num_features + num_users*num_features)
x_shape = (num_movies,num_features)
theta_shape = (num_users,num_features)
def train_recommender_system(
    cost_fun, learning_rate=0.01, iterrations=1000, *args
):
    """Plain batch gradient-descent driver.

    ``args[0]`` is the initial flat parameter vector; any remaining
    ``args`` are forwarded to ``cost_fun`` after the current parameters.
    ``cost_fun`` must return a ``(cost, gradient)`` pair.

    Returns (cost_history, params_history, final_params); both histories
    record the value *before* each update, so the returned final
    parameters are one step past the last history entry.
    """
    params, *extra = args
    cost_history, params_history = [], []
    for _ in range(iterrations):
        cost, grad = cost_fun(params, *extra)
        print(cost, end='\r')  # lightweight in-place progress indicator
        cost_history.append(cost)
        params_history.append(params)
        params = params - learning_rate * grad
    return cost_history, params_history, params
# Bundle the arguments in the order the cost function expects and run 1000
# gradient-descent iterations (slow; the cost prints in place as it runs).
start_params = [init_guess, x_shape, theta_shape, y, r]
c_h, p_h, p = train_recommender_system(
    cost_function_colab_filter,
    0.01,
    1000,
    *start_params
)
# Final training cost (pasted notebook output): 1.332942026385068e-99
# Unpack the trained flat vector back into the two factor matrices and
# reconstruct the full predicted-rating matrix (movies x users).
x_fin = p[:np.prod(x_shape)].reshape(x_shape)
theta_fin = p[np.prod(x_shape):].reshape(theta_shape)
fitted_ratings = x_fin@theta_fin.T
# Let's predict the best movies for person #5.
# The code below rates the movies and provides the index of each movie
# (since we do not have movie names).
# Rank every movie for user #5 by predicted rating, highest first;
# each entry is (movie_index, predicted_rating).
sorted(enumerate(fitted_ratings[:,5]),key=lambda x: x[1],reverse = True)
# Pasted notebook output: [(13, 4.99999904610127), (3, 4.179536581038713), (30, 4.060959232313805), (11, 3.9999998181588414), (18, 3.9999997894156314), (31, 3.99999957314985), (7, 3.9999992638869077), (22, 3.999999132047354), (0, 3.9999988415025585), (8, 3.9999981541476974), (49, 3.999997394076476), (44, 3.7228355319760387), (10, 3.325965336949985), (16, 3.0380419325888433), (21, 3.000000744721721), (46, 3.000000377968602), (20, 2.999999737824136), (14, 2.9999994846386437), (5, 2.9858071251158766), (25, 2.977505184451111), (32, 2.863899507626943), (4, 2.8503867109315135), (41, 2.8401079310464366), (43, 2.833473091883081), (17, 2.7936696725678645), (15, 2.7018193498697536), (9, 2.375699825729945), (38, 2.3269879329537067), (39, 2.2591515739054238), (24, 2.2511504336215715), (29, 2.232499163136199), (26, 2.047735420327976), (48, 2.004425496133342), (19, 2.003858509274897), (27, 1.9999989465968062), (12, 1.9999987293449637), (6, 1.9999972335000793), (47, 1.719334338341354), (40, 1.7077637872649107), (36, 1.5452810356469124), (37, 1.4904125188027548), (45, 1.3881525185902068), (33, 1.1679211886025231), (42, 1.0705354410282475), (1, 1.0139900821602423), (2, 0.9238881058815922), (34, 0.6399848922330877), (28, 0.6129042519002814), (23, 0.4968897419910422), (35, 0.07203087221926735)]